#! /bin/bash -x
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.

set -eu

RUN_DIR=/run/nvidia
PID_FILE=${RUN_DIR}/${0##*/}.pid
DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
NUM_VGPU_DEVICES=0

_update_package_cache() {
    if [ "${PACKAGE_TAG:-}" != "builtin" ]; then
        echo "Updating the package cache..."
        yum -q makecache
    fi
}

_cleanup_package_cache() {
    if [ "${PACKAGE_TAG:-}" != "builtin" ]; then
        echo "Cleaning up the package cache..."
        rm -rf /var/cache/yum/*
    fi
}

# Resolve the kernel version to the form major.minor.patch-revision.
_resolve_kernel_version() {
    local version=$(yum -q list available --showduplicates kernel-headers |
      awk -v arch=$(uname -m) 'NR>1 {print $2"."arch}' | tac | grep -E -m1 "^${KERNEL_VERSION/latest/.*}")

    echo "Resolving Linux kernel version..."
    if [ -z "${version}" ]; then
        echo "Could not resolve Linux kernel version" >&2
        return 1
    fi
    KERNEL_VERSION="${version}"
    echo "Proceeding with Linux kernel version ${KERNEL_VERSION}"
    return 0
}

# Install the kernel modules header/builtin/order files and generate the kernel version string.
_install_prerequisites() (
    local tmp_dir=$(mktemp -d)

    trap "rm -rf ${tmp_dir}" EXIT
    cd ${tmp_dir}

    echo "Installing elfutils..."
    dnf install -q -y elfutils-libelf.x86_64 elfutils-libelf-devel.x86_64

    rm -rf /lib/modules/${KERNEL_VERSION}
    mkdir -p /lib/modules/${KERNEL_VERSION}/proc

    echo "Enabling RHOCP and EUS RPM repos..."
    dnf config-manager --set-enabled rhocp-${OPENSHIFT_VERSION}-for-rhel-8-x86_64-rpms || true
    if ! dnf makecache --releasever=${RHEL_VERSION}; then
    	dnf config-manager --set-disabled rhocp-${OPENSHIFT_VERSION}-for-rhel-8-x86_64-rpms
    fi

    dnf config-manager --set-enabled rhel-8-for-x86_64-baseos-eus-rpms  || true
    if ! dnf makecache --releasever=${RHEL_VERSION}; then
    	dnf config-manager --set-disabled rhel-8-for-x86_64-baseos-eus-rpms
    fi

    echo "Installing Linux kernel headers..."
    dnf -q -y --releasever=${RHEL_VERSION} install kernel-headers-${KERNEL_VERSION} kernel-devel-${KERNEL_VERSION} > /dev/null
    ln -s /usr/src/kernels/${KERNEL_VERSION} /lib/modules/${KERNEL_VERSION}/build

    echo "Installing Linux kernel module files..."
    dnf -q -y --releasever=${RHEL_VERSION} install kernel-core-${KERNEL_VERSION} > /dev/null

    # Prevent depmod from giving a WARNING about missing files 
    touch /lib/modules/${KERNEL_VERSION}/modules.order
    touch /lib/modules/${KERNEL_VERSION}/modules.builtin

    depmod ${KERNEL_VERSION}

    echo "Generating Linux kernel version string..."
    extract-vmlinux /lib/modules/${KERNEL_VERSION}/vmlinuz | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version
    if [ -z "$(<version)" ]; then
        echo "Could not locate Linux kernel version string" >&2
        return 1
    fi
    mv version /lib/modules/${KERNEL_VERSION}/proc

    # Parse gcc version
    # gcc_version is expected to match x.y.z
    # current_gcc is expected to match 'gcc-x.y.z-rel.el8.x86_64
    local gcc_version=$(cat /lib/modules/${KERNEL_VERSION}/proc/version | grep -Eo "gcc version ([0-9\.]+)" | grep -Eo "([0-9\.]+)")
    local current_gcc=$(rpm -qa gcc)
    echo "kernel requires gcc version: 'gcc-${gcc_version}', current gcc version is '${current_gcc}'"

    if [[ "gcc-${gcc_version}" != "${current_gcc}"* ]]; then
        dnf install -q -y --releasever=${RHEL_VERSION} "gcc-${gcc_version}"
    fi
)

# Cleanup the prerequisites installed above.
_remove_prerequisites() {
    true
    if [ "${PACKAGE_TAG:-}" != "builtin" ]; then
        dnf -q -y remove kernel-headers-${KERNEL_VERSION} kernel-devel-${KERNEL_VERSION} > /dev/null
        # TODO remove module files not matching an existing driver package.
    fi
}

# Check if the kernel version requires a new precompiled driver packages.
_kernel_requires_package() {
    local proc_mount_arg=""

    echo "Checking NVIDIA driver packages..."

    [[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/kernel ]] && return 0
    cd /usr/src/nvidia-${DRIVER_VERSION}/kernel

    proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc"
    for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do
        is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg})
        if [ "${is_match}" == "kernel interface matches." ]; then
            echo "Found NVIDIA driver package ${pkg_name##*/}"
            return 1
        fi
    done
    return 0
}

# Compile the kernel modules, optionally sign them, and generate a precompiled package for use by the nvidia-installer.
_create_driver_package() (
    local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}"
    local nvidia_sign_args=""
    local nvidia_modeset_sign_args=""
    local nvidia_uvm_sign_args=""

    trap "make -s -j SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT

    echo "Compiling NVIDIA driver kernel modules..."
    cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
    make -s -j SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null

    echo "Relinking NVIDIA driver kernel modules..."
    rm -f nvidia.ko nvidia-modeset.ko
    ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary
    ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary

    if [ -n "${PRIVATE_KEY}" ]; then
        echo "Signing NVIDIA driver kernel modules..."
        donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/linux-headers-${KERNEL_VERSION}/scripts && \
          sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign &&                          \
          sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign &&          \
          sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko"
        nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign"
        nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign"
        nvidia_uvm_sign_args="--signed"
    fi

    echo "Building NVIDIA driver package ${pkg_name}..."
    ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION}                              \
                                        --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc       \
                                        --driver-version ${DRIVER_VERSION}                           \
                                        --kernel-interface nv-linux.o                                \
                                        --linked-module-name nvidia.ko                               \
                                        --core-object-name nvidia/nv-kernel.o_binary                 \
                                        ${nvidia_sign_args}                                          \
                                        --target-directory .                                         \
                                        --kernel-interface nv-modeset-linux.o                        \
                                        --linked-module-name nvidia-modeset.ko                       \
                                        --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \
                                        ${nvidia_modeset_sign_args}                                  \
                                        --target-directory .                                         \
                                        --kernel-module nvidia-uvm.ko                                \
                                        ${nvidia_uvm_sign_args}                                      \
                                        --target-directory .
    mkdir -p precompiled
    mv ${pkg_name} precompiled
)

# Load the kernel modules and start persistenced.
_load_driver() {
    echo "Loading IPMI kernel module..."
    modprobe ipmi_msghandler

    echo "Loading NVIDIA driver kernel modules..."
    modprobe -a nvidia nvidia-uvm nvidia-modeset

    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "FNE" ]; then
            echo "Copying gridd.conf..."
            cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        elif [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying MessengerToken..."
            mkdir -p  /etc/nvidia/MessengerToken/
            cp /drivers/MessengerToken/* /etc/nvidia/MessengerToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
    fi
}

# Stop persistenced and unload the kernel modules if they are currently loaded.
_unload_driver() {
    local rmmod_args=()
    local nvidia_deps=0
    local nvidia_refs=0
    local nvidia_uvm_refs=0
    local nvidia_modeset_refs=0

    echo "Stopping NVIDIA persistence daemon..."
    if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
        local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)

        kill -SIGTERM "${pid}"
        for i in $(seq 1 10); do
            kill -0 "${pid}" 2> /dev/null || break
            sleep 0.1
        done
        if [ $i -eq 10 ]; then
            echo "Could not stop NVIDIA persistence daemon" >&2
            return 1
        fi
    fi

    if [ -f /var/run/nvidia-gridd/nvidia-gridd.pid ]; then
        echo "Stopping NVIDIA grid daemon..."
        local pid=$(< /var/run/nvidia-gridd/nvidia-gridd.pid)

        kill -SIGTERM "${pid}"
        for i in $(seq 1 10); do
            kill -0 "${pid}" 2> /dev/null || break
            sleep 0.1
        done
        if [ $i -eq 10 ]; then
            echo "Could not stop NVIDIA Grid daemon" >&2
            return 1
        fi
    fi

    echo "Unloading NVIDIA driver kernel modules..."
    if [ -f /sys/module/nvidia_modeset/refcnt ]; then
        nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
        rmmod_args+=("nvidia-modeset")
        ((++nvidia_deps))
    fi
    if [ -f /sys/module/nvidia_uvm/refcnt ]; then
        nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
        rmmod_args+=("nvidia-uvm")
        ((++nvidia_deps))
    fi
    if [ -f /sys/module/nvidia/refcnt ]; then
        nvidia_refs=$(< /sys/module/nvidia/refcnt)
        rmmod_args+=("nvidia")
    fi
    if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
        echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
        return 1
    fi

    if [ ${#rmmod_args[@]} -gt 0 ]; then
        rmmod ${rmmod_args[@]}
    fi
    return 0
}

# Link and install the kernel modules from a precompiled package using the nvidia-installer.
_install_driver() {
    local install_args=()

    echo "Installing NVIDIA driver kernel modules..."
    cd /usr/src/nvidia-${DRIVER_VERSION}
    rm -rf /lib/modules/${KERNEL_VERSION}/video

    if [ "${ACCEPT_LICENSE}" = "yes" ]; then
        install_args+=("--accept-license")
    fi
    IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
    # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
    # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
    # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
    #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
}

# Mount the driver rootfs into the run directory with the exception of sysfs.
_mount_rootfs() {
    echo "Mounting NVIDIA driver rootfs..."
    mount --make-runbindable /sys
    mount --make-private /sys
    mkdir -p ${RUN_DIR}/driver
    mount --rbind / ${RUN_DIR}/driver

    echo "Change device files security context for selinux compatibility"
    chcon -R -t container_file_t ${RUN_DIR}/driver/dev
}

# Unmount the driver rootfs from the run directory.
_unmount_rootfs() {
    echo "Unmounting NVIDIA driver rootfs..."
    if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
        umount -l -R ${RUN_DIR}/driver
    fi
}

# Write a kernel postinst.d script to automatically precompile packages on kernel update (similar to DKMS).
_write_kernel_update_hook() {
    if [ ! -d ${KERNEL_UPDATE_HOOK%/*} ]; then
        return
    fi

    echo "Writing kernel update hook..."
    cat > ${KERNEL_UPDATE_HOOK} <<'EOF'
#!/bin/bash

set -eu
trap 'echo "ERROR: Failed to update the NVIDIA driver" >&2; exit 0' ERR

NVIDIA_DRIVER_PID=$(< /run/nvidia/nvidia-driver.pid)

export "$(grep -z DRIVER_VERSION /proc/${NVIDIA_DRIVER_PID}/environ)"
nsenter -t "${NVIDIA_DRIVER_PID}" -m -- nvidia-driver update --kernel "$1"
EOF
    chmod +x ${KERNEL_UPDATE_HOOK}
}

_shutdown() {
    if _unload_driver; then
        _unmount_rootfs
        rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK}
        return 0
    fi
    return 1
}

_find_vgpu_driver_version() {
    local count=""
    local version=""

    if [ "${DISABLE_VGPU_VERSION_CHECK}" = "true" ]; then
        echo "vgpu version compatibility check is disabled"
        return 0
    fi
    # check if vgpu devices are present
    count=$(vgpu-util count)
    if [ $? -ne 0 ]; then
         echo "cannot find vgpu devices on host, pleae check /var/log/vgpu-util.log for more details..."
         return 0
    fi
    NUM_VGPU_DEVICES=$(echo "$count" | awk -F= '{print $2}')
    if [ $NUM_VGPU_DEVICES -eq 0 ]; then
        # no vgpu devices found, treat as passthrough
        return 0
    fi
    echo "found $NUM_VGPU_DEVICES vgpu devices on host"

    # find compatible guest driver using drive catalog
    version=$(vgpu-util match -i /drivers -c /drivers/vgpuDriverCatalog.yaml)
    if [ $? -ne 0 ]; then
        echo "cannot find match for compatible vgpu driver from available list, please check /var/log/vgpu-util.log for more details..."
        return 1
    fi
    DRIVER_VERSION=$(echo "$version" | awk -F= '{print $2}')
    echo "vgpu driver version selected: ${DRIVER_VERSION}"
    return 0
}

init() {
    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        _find_vgpu_driver_version || exit 1
    fi

    # Install the userspace components and copy the kernel module sources.
    sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
        cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
        sh /tmp/install.sh nvinstall && \
        mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
        mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \
        sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest

    echo -e "\n========== NVIDIA Software Installer ==========\n"
    echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

    exec 3> ${PID_FILE}
    if ! flock -n 3; then
        echo "An instance of the NVIDIA driver is already running, aborting"
        exit 1
    fi
    echo $$ >&3

    trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
    trap "_shutdown" EXIT

    _unload_driver || exit 1
    _unmount_rootfs

    # Install dependencies
    if _kernel_requires_package; then
        _update_package_cache
        _install_prerequisites
        _create_driver_package
        #_remove_prerequisites
        _cleanup_package_cache
    fi

    # Build the driver
    _install_driver
    _load_driver
    _mount_rootfs
    _write_kernel_update_hook

    echo "Done, now waiting for signal"
    sleep infinity &
    trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT
    while true; do wait $! || continue; done
    exit 0
}

update() {
    exec 3>&2
    if exec 2> /dev/null 4< ${PID_FILE}; then
        if ! flock -n 4 && read pid <&4 && kill -0 "${pid}"; then
            exec > >(tee -a "/proc/${pid}/fd/1")
            exec 2> >(tee -a "/proc/${pid}/fd/2" >&3)
        else
            exec 2>&3
        fi
        exec 4>&-
    fi
    exec 3>&-

    # vgpu driver version is choosen dynamically during runtime, so pre-compile modules for
    # only non-vgpu driver types
    if [ "${DRIVER_TYPE}" != "vgpu" ]; then
        # Install the userspace components and copy the kernel module sources.
        if [ ! -e /usr/src/nvidia-${DRIVER_VERSION}/mkprecompiled ]; then
            sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
                cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
                sh /tmp/install.sh nvinstall && \
                mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
                mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \
                sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest
        fi
    fi

    echo -e "\n========== NVIDIA Software Updater ==========\n"
    echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

    trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM

    _update_package_cache
    _resolve_kernel_version || exit 1
    _install_prerequisites
    if _kernel_requires_package; then
        _create_driver_package
    fi
    _remove_prerequisites
    _cleanup_package_cache

    echo "Done"
    exit 0
}

usage() {
    cat >&2 <<EOF
Usage: $0 COMMAND [ARG...]

Commands:
  init   [-a | --accept-license]
  update [-k | --kernel VERSION] [-s | --sign KEYID] [-t | --tag TAG]
EOF
    exit 1
}

if [ $# -eq 0 ]; then
    usage
fi
command=$1; shift
case "${command}" in
    init) options=$(getopt -l accept-license -o a -- "$@") ;;
    update) options=$(getopt -l kernel:,sign:,tag: -o k:s:t: -- "$@") ;;
    *) usage ;;
esac
if [ $? -ne 0 ]; then
    usage
fi
eval set -- "${options}"

ACCEPT_LICENSE=""
KERNEL_VERSION=$(uname -r)
PRIVATE_KEY=""
PACKAGE_TAG=""

for opt in ${options}; do
    case "$opt" in
    -a | --accept-license) ACCEPT_LICENSE="yes"; shift 1 ;;
    -k | --kernel) KERNEL_VERSION=$2; shift 2 ;;
    -s | --sign) PRIVATE_KEY=$2; shift 2 ;;
    -t | --tag) PACKAGE_TAG=$2; shift 2 ;;
    --) shift; break ;;
    esac
done
if [ $# -ne 0 ]; then
    usage
fi

$command
